SNP calling

BASA (Balsamorhiza sagittata) data:

Balsamorhiza sagittata
Balsamorhiza hookeri
BAHO3 x BASA hybrid 
Lupinus wyethii (Outgroup)

Data locations:


Raw data located in: 
/archive/parchman_lab/rawdata_to_backup/BASA_CHDO_rawNOVASEQ/Library2-BASA_S2_L002_R1_001.fastq.gz

Cleaned and demultiplexed .fastq files by individual:
/working/tfaske/balsam/demult/fastq (BASA)
/working/tfaske/balsam/demult/outgroup_fastq (outgroup)


Generating the reference with dDocent:


mkdir refOpt (place here 4 individuals per pop)
./ReferenceOpt.sh 4 10 4 10 SE 5 0.95 0.99 0.005

#####
R
# Visualize ReferenceOpt output: number of assembled contigs versus
# clustering similarity, one line per k1/k2 combination.
library(ggplot2)

kopt <- read.table(
  "kopt.data",
  header = FALSE,
  col.names = c("k1", "k2", "Similarity", "Contigs")
)
kopt$K1K2 <- as.factor(paste(kopt$k1, kopt$k2, sep = ","))

p <- ggplot(kopt, aes(x = Similarity, y = Contigs, group = K1K2)) +
  scale_x_continuous(breaks = seq(0.8, 0.98, 0.01)) +
  geom_line(aes(colour = K1K2))
p
ggsave("kvalues.pdf", p, height = 8, width = 10, units = 'in')
#####

Run dDocent on this subset with the correct assembly parameters (skipping mapping and snp calling)

./RefMapOpt.sh 4 6 4 6 0.975 SE 10
./compress.sh (compress intermediate files)

Move the reads and reference files to the main directory
Run dDocent on the full data set, skipping trimming, assembly, snp calling


SNP calling:


Run ./bwa_sort.sh
# Collect individual IDs from the cleaned, demultiplexed fastq file names.
INDS=($(for fq in /working/mascaro/basa/outgroup_2/*.F.fq.gz; do basename "${fq%.F.fq.gz*}"; done))

module load bwa/0.7.8
module load samtools/1.3

# The reference is the same for every individual, so set it once.
REF=/working/mascaro/basa/outgroup_2/reference.fasta

# Align each individual's reads and write a coordinate-sorted BAM.
for IND in ${INDS[@]}
do
    FORWARD=/working/mascaro/basa/outgroup_2/${IND}.F.fq.gz
    OUTPUT=/working/mascaro/basa/outgroup_2/assembly/${IND}_sort.bam
    echo "Aligning $IND with bwa"
    bwa mem -M -t 10 $REF $FORWARD | samtools view -b | samtools sort -T ${IND} > $OUTPUT
done

###bcftools
# Joint SNP calling across all per-individual sorted BAMs.
Run ./bcftools.sh
REF=/working/mascaro/basa/outgroup_2/reference.fasta

module load bcftools/1.9
# mpileup keeps allelic depth (AD), per-sample depth (DP) and strand bias (SP)
# annotations and streams uncompressed BCF (-Ou) into the multiallelic
# caller (-m), which adds GQ/GP and writes a bgzipped VCF (-O z).
bcftools mpileup -a AD,DP,SP -Ou -f $REF \
./assembly/*_sort.bam | bcftools call -f GQ,GP \
-mO z -o ./basa.vcf.gz


Filtering:


#### 
#The obtained vcf file:
#Final.recode.vcf: 

# Statistics with Vcftools:
# per-site quality (.lqual), allele frequencies at biallelic sites (.frq),
# per-individual depth (.idepth), per-site mean depth (.ldepth.mean),
# per-individual missingness (.imiss) and per-site missingness (.lmiss).
vcftools --gzvcf basa.vcf.gz --site-quality --out /working/mascaro/basa/outgroup_2/filter/quality
vcftools --gzvcf basa.vcf.gz --freq2 --out /working/mascaro/basa/outgroup_2/filter --max-alleles 2
vcftools --gzvcf basa.vcf.gz --depth --out /working/mascaro/basa/outgroup_2/filter/meandepthind
vcftools --gzvcf basa.vcf.gz --site-mean-depth --out /working/mascaro/basa/outgroup_2/filter/meandepsite
vcftools --gzvcf basa.vcf.gz --missing-indv --out /working/mascaro/basa/outgroup_2/filter/missing
vcftools --gzvcf basa.vcf.gz --missing-site --out /working/mascaro/basa/outgroup_2/filter/missingsite

##### Examining statistics in R
R
library(ggplot2)
library(tidyverse)

# Per-site quality (.lqual: CHROM, POS, QUAL).
var_qual <- read_delim("quality.lqual", delim = "\t",
                       col_names = c("chr", "pos", "qual"), skip = 1)
a <- ggplot(var_qual, aes(qual)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
ggsave("quality.pdf", a, height = 8, width = 10, units = 'in')

# Per-individual mean depth (.idepth: INDV, N_SITES, MEAN_DEPTH).
# Fixed: the column names used here and for .ldepth.mean below were
# swapped — .idepth has 3 per-individual columns, not (chr, pos, ...).
ind_depth <- read_delim("meandepthind.idepth", delim = "\t",
                        col_names = c("ind", "nsites", "mean_depth"), skip = 1)
a <- ggplot(ind_depth, aes(mean_depth)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
summary(ind_depth$mean_depth)
a + theme_light() + xlim(0, 100)
ggsave("meandepth_ind.pdf", a, height = 8, width = 10, units = 'in')

# Per-site missingness (.lmiss).
var_miss <- read_delim("missingsite.lmiss", delim = "\t",
                       col_names = c("chr", "pos", "nchr", "nfiltered", "nmiss", "fmiss"),
                       skip = 1)
a <- ggplot(var_miss, aes(fmiss)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
summary(var_miss$fmiss)
# Distinct file name: previously both missingness plots wrote "missing.pdf".
ggsave("missing_site.pdf", a, height = 8, width = 10, units = 'in')

# Per-site mean depth (.ldepth.mean: CHROM, POS, MEAN_DEPTH, VAR_DEPTH).
site_depth <- read_delim("meandepsite.ldepth.mean", delim = "\t",
                         col_names = c("chr", "pos", "mean_depth", "var_depth"),
                         skip = 1)
a <- ggplot(site_depth, aes(mean_depth)) +
  geom_histogram(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
ggsave("meandepth_site.pdf", a, height = 8, width = 10, units = 'in')

# Per-individual missingness (.imiss).
ind_miss <- read_delim("missing.imiss", delim = "\t",
                       col_names = c("ind", "ndata", "nfiltered", "nmiss", "fmiss"),
                       skip = 1)
a <- ggplot(ind_miss, aes(fmiss)) +
  geom_histogram(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
ggsave("missing_ind.pdf", a, height = 8, width = 10, units = 'in')
#######

# Filtering:
# basa.vcf.gz is bgzipped, so use --gzvcf (was --vcf).
vcftools --gzvcf basa.vcf.gz --maf 0.05 --max-missing 0.6 --minQ 30 --min-meanDP 5 --max-meanDP 80 --minDP 5 --maxDP 80 --recode --out basa_filtered

# Individuals with more than 50% missing genotypes (column 5 of .imiss).
awk '$5 > 0.5' missing.imiss | cut -f1 > lowDP.indv

# --out basa_filtered above produces basa_filtered.recode.vcf
# (was mistyped as basa_filtered.vcf.recode.vcf).
vcftools --vcf basa_filtered.recode.vcf --remove lowDP.indv --recode --recode-INFO-all --out variants_clean

vcftools --vcf variants_clean.recode.vcf --out basa_filtered_final --recode --remove-filtered-all

# Statistics with Vcftools (plain-text VCFs, so --vcf; fixed a ".vcff" typo).
# NOTE(review): $OUT is never defined in this script — set OUT or replace it
# with an explicit output prefix before running the --freq2 line.
vcftools --vcf basa_filtered_final.recode.vcf --site-quality --out quality
vcftools --vcf basa_filtered_final.recode.vcf --freq2 --out $OUT --max-alleles 2
vcftools --vcf basa_filtered_final.recode.vcf --depth --out meandepthind
vcftools --vcf basa_filtered_final.recode.vcf --site-mean-depth --out meandepsite
vcftools --vcf basa_filtered_final.recode.vcf --missing-indv --out missing
vcftools --vcf basa_filtered_final.recode.vcf --missing-site --out missingsite

# Filtering again (the original command was line-wrapped mid-flag: "--m inQ"):
vcftools --vcf basa_filtered_final.recode.vcf --maf 0.05 --max-missing 0.75 --minQ 30 --min-meanDP 10 --max-meanDP 80 --minDP 10 --maxDP 80 --recode --out basa_final2

# Statistics on the second-pass output. --out basa_final2 produces
# basa_final2.recode.vcf; the originals pointed at a nonexistent
# basa_filtered2.recode.vcf.
vcftools --vcf basa_final2.recode.vcf --site-quality --out quality
vcftools --vcf basa_final2.recode.vcf --freq2 --out $OUT --max-alleles 2
vcftools --vcf basa_final2.recode.vcf --depth --out meandepthind
vcftools --vcf basa_final2.recode.vcf --site-mean-depth --out meandepsite
vcftools --vcf basa_final2.recode.vcf --missing-indv --out missing
vcftools --vcf basa_final2.recode.vcf --missing-site --out missingsite

After filtering: 27621 loci and 379 individuals


Genetic diversity and differentiation

Genetic diversity statistics:
Overall, excess of heterozygotes (Ho>He; negative Fis values), high nucleotide diversity, and low Fst values. AMOVA showing only 22 % of the variance explained by population.


# Genetic diversity and differentiation statistics (hierfstat/pegas)
# computed from the filtered VCF.
library(vcfR)
library("adegenet")
library("hierfstat")
library("pegas")
# NOTE(review): the filtering step above wrote basa_final2.recode.vcf —
# confirm "basa_final.vcf" is the intended (renamed) final file.
basa.vcf <- read.vcfR("basa_final.vcf")
basa.vcf

##Fis, Fst
my_genind <- vcfR2genind(basa.vcf)
x<- my_genind 
# NOTE(review): "pops" is never defined before this line, so this
# self-referencing call will fail. Populate pops with one population
# label per individual (in VCF sample order) before running it.
pops <- as.factor(c(pops))

#Population specific Fis:
myData.hfstat <- genind2hierfstat(my_genind, pop = pops)
# basic.stats() returns Ho, Hs, Fis, etc. per population (4 digits).
basicstat <- basic.stats(myData.hfstat, diploid = TRUE, digits = 4) 
basicstat$Fis
write.csv(basicstat$Fis, "Fis.csv")

##Bootstrapping over loci of population's Fis
boot.ppfis(myData.hfstat)
#Nei's Pairwise FSTs: 
x <- genet.dist(myData.hfstat,diploid=TRUE,method="Ds")# Nei’s standard genetic distance
fst <- as.matrix(x)
# NOTE(review): file is named Fst.txt but method="Ds" computes Nei's
# standard genetic distance, not pairwise Fst.
write.table(fst, "Fst.txt")
##Bootstrapping over loci of pairwise Fst
#boot.ppfst(myData.hfstat)

basicstat$Ho #observed
write.csv(basicstat$Ho, "HO.csv")
basicstat$Hs #expected
write.csv(basicstat$Hs, "Hs.csv")
basicstat

###########################################
##vcftools Pi and TajimaD
#!/bin/sh
# .vcf file
# .pop file (unique names of pops, one per line)
# .map file (individual to population mapping file — 2 columns)

#cat basa.pop | while read line;
#do
#grep "$line" basa.map > $line.pop
#done

#for p in *.pop
#do
#vcftools --vcf basa_final.vcf --keep $p --site-pi --out $p
#done

#for p in *.pop
#do
#vcftools --vcf basa_final.vcf --keep $p --TajimaD 100 --out $p
#done

#AMOVA: hierarchical partition of genetic variance (pop / subpop).
library(vcfR)
library("adegenet")
library("hierfstat")
library("pegas")
library("poppr")
library("magrittr")
library(ape)

# vcf to genind
BASA.VCF <- read.vcfR("basa.vcf")
my_genind <- vcfR2genind(BASA.VCF)

### Stratification table: one row per individual (population, subpopulation, ...)
file.hier = read.table("ind_pop.txt", header=T)
strata(my_genind) = file.hier
my_genind

## AMOVA with stratifications (loci with >5% missing data dropped by cutoff)
# Fixed: the original called poppr.amova(acth.hier.strat, ...), an object
# from a different (ACTH) project that does not exist in this script; the
# stratified genind built above is what should be analyzed.
amova.res.95 = poppr.amova(my_genind, ~pop/subpop, cutoff=0.95)
amova.res.95
write.table(amova.res.95$results, sep = ",", file = "results_amova.csv")
write.table(amova.res.95$componentsofcovariance, sep = ",", file = "Components_covariance.csv")
write.table(amova.res.95$statphi, sep = ",", file = "phi.csv")

## To test if populations are significantly different (permutation test)
amova.test.95 = randtest(amova.res.95)
amova.test.95


Table 1. He, Ho, Fis, Pi, Tajima D
Pop He Ho Fis π TajimaD
TM 0.284 0.428 -0.385 0.241 0.313
KB 0.288 0.430 -0.357 0.239 0.339
CC 0.291 0.432 -0.354 0.246 0.275
KA 0.291 0.432 -0.372 0.249 0.295
RP 0.290 0.432 -0.364 0.251 0.272
BH 0.289 0.434 -0.380 0.231 0.356
LL 0.288 0.434 -0.385 0.242 0.286
CF 0.303 0.435 -0.328 0.265 0.222
CL 0.290 0.437 -0.377 0.246 0.333
SP 0.293 0.437 -0.373 0.251 0.271
EC 0.295 0.442 -0.363 0.253 0.265
GA 0.298 0.442 -0.358 0.262 0.242
AN 0.298 0.444 -0.358 0.258 0.308
SL 0.300 0.444 -0.358 0.261 0.211
SC 0.300 0.444 -0.349 0.265 0.227
LC 0.299 0.445 -0.362 0.251 0.351
AS 0.300 0.445 -0.352 0.271 0.284
BB 0.305 0.446 -0.339 0.268 0.262
WC 0.304 0.447 -0.342 0.271 0.272
JV 0.304 0.448 -0.346 0.264 0.285
SD 0.302 0.449 -0.355 0.271 0.299
KM 0.301 0.449 -0.382 0.281 0.367
RC 0.303 0.450 -0.351 0.272 0.265
RL 0.304 0.450 -0.348 0.272 0.283
GJ 0.301 0.451 -0.362 0.269 0.263
MM 0.303 0.452 -0.365 0.272 0.311
GB 0.307 0.453 -0.347 0.277 0.242
CN 0.308 0.454 -0.347 0.272 0.261
SY 0.306 0.455 -0.358 0.268 0.274
MT 0.303 0.456 -0.372 0.276 0.324
LM 0.306 0.456 -0.356 0.279 0.222
QG 0.307 0.457 -0.359 0.271 0.311


.inline-btns p {
  display: inline;
}
library(downloadthis)
Fst_results <- read.csv("files/Fst.csv")
d_btn <- Fst_results %>% 
  download_this(
    path = system.file("files/", package = "downloadthis"),
    output_name = "Fst values",
    output_extension = ".csv",
    button_label = "Fst",
    button_type = "default",
    has_icon = TRUE,
    icon = "fa fa-save"
  )


Fst results here

Table 2. Fst
X AN AS BB BH CC CF CL CN EC GA GB GJ JV KA KB KM LC LL LM MM MT QG RC RL RP SC SD SL SP SY TM WC
AN 0.0000000 0.0212699 0.0200213 0.0346840 0.0327182 0.0267836 0.0246469 0.0189064 0.0190003 0.0266201 0.0175230 0.0164721 0.0162117 0.0355746 0.0303433 0.0285037 0.0221734 0.0338473 0.0188846 0.0178398 0.0170678 0.0179127 0.0164681 0.0181886 0.0323123 0.0207600 0.0178579 0.0283760 0.0337146 0.0169339 0.0363946 0.0228502
AS 0.0212699 0.0000000 0.0186616 0.0329252 0.0235971 0.0224353 0.0253128 0.0183065 0.0245430 0.0258348 0.0176178 0.0197878 0.0189161 0.0335354 0.0296384 0.0260247 0.0236629 0.0337474 0.0174792 0.0202507 0.0204187 0.0200585 0.0185294 0.0197295 0.0297079 0.0201100 0.0185614 0.0272561 0.0262603 0.0216470 0.0280640 0.0152160
BB 0.0200213 0.0186616 0.0000000 0.0319076 0.0244552 0.0214392 0.0248414 0.0180955 0.0231898 0.0247534 0.0179690 0.0187357 0.0174954 0.0321670 0.0286313 0.0282892 0.0223153 0.0331018 0.0182653 0.0198932 0.0202839 0.0192087 0.0166406 0.0183536 0.0289344 0.0155876 0.0196366 0.0263040 0.0267735 0.0195137 0.0290556 0.0178810
BH 0.0346840 0.0329252 0.0319076 0.0000000 0.0341948 0.0348293 0.0410418 0.0333634 0.0331291 0.0158469 0.0328156 0.0339422 0.0340365 0.0252982 0.0261380 0.0397149 0.0390792 0.0295467 0.0321425 0.0365994 0.0370556 0.0365161 0.0333541 0.0299791 0.0154639 0.0342866 0.0354557 0.0220385 0.0350017 0.0367025 0.0365534 0.0304946
CC 0.0327182 0.0235971 0.0244552 0.0341948 0.0000000 0.0237590 0.0374692 0.0258122 0.0321994 0.0289741 0.0265930 0.0314310 0.0308357 0.0325807 0.0299775 0.0322526 0.0352532 0.0345354 0.0228768 0.0331576 0.0342563 0.0327005 0.0293691 0.0297479 0.0296424 0.0282460 0.0303569 0.0288153 0.0189424 0.0340537 0.0198847 0.0168270
CF 0.0267836 0.0224353 0.0214392 0.0348293 0.0237590 0.0000000 0.0294367 0.0231731 0.0279445 0.0281877 0.0232136 0.0245153 0.0219338 0.0340184 0.0315772 0.0322944 0.0277815 0.0364529 0.0231211 0.0258244 0.0272402 0.0251380 0.0218596 0.0236070 0.0300860 0.0206356 0.0260934 0.0295962 0.0262150 0.0268463 0.0277910 0.0209021
CL 0.0246469 0.0253128 0.0248414 0.0410418 0.0374692 0.0294367 0.0000000 0.0251011 0.0299262 0.0331434 0.0231170 0.0231664 0.0216017 0.0409459 0.0380001 0.0359048 0.0166704 0.0425012 0.0262426 0.0236694 0.0238168 0.0231343 0.0217015 0.0238932 0.0375295 0.0237068 0.0242312 0.0350436 0.0397157 0.0242384 0.0427709 0.0276300
CN 0.0189064 0.0183065 0.0180955 0.0333634 0.0258122 0.0231731 0.0251011 0.0000000 0.0229948 0.0253191 0.0149444 0.0173068 0.0171130 0.0341757 0.0295207 0.0233035 0.0221045 0.0332522 0.0130664 0.0180485 0.0172908 0.0177681 0.0162922 0.0178054 0.0305035 0.0193922 0.0175124 0.0271138 0.0276345 0.0174463 0.0301545 0.0171157
EC 0.0190003 0.0245430 0.0231898 0.0331291 0.0321994 0.0279445 0.0299262 0.0229948 0.0000000 0.0263762 0.0226038 0.0214934 0.0211447 0.0335061 0.0287778 0.0328450 0.0274776 0.0324182 0.0233297 0.0234037 0.0230408 0.0231056 0.0212814 0.0224650 0.0305677 0.0242452 0.0236657 0.0273561 0.0339258 0.0228591 0.0364258 0.0244998
GA 0.0266201 0.0258348 0.0247534 0.0158469 0.0289741 0.0281877 0.0331434 0.0253191 0.0263762 0.0000000 0.0251998 0.0256548 0.0263371 0.0236291 0.0221377 0.0338229 0.0312204 0.0257589 0.0239398 0.0283194 0.0280406 0.0278849 0.0253777 0.0224607 0.0151134 0.0270017 0.0268538 0.0184739 0.0294554 0.0275287 0.0315750 0.0235851
GB 0.0175230 0.0176178 0.0179690 0.0328156 0.0265930 0.0232136 0.0231170 0.0149444 0.0226038 0.0251998 0.0000000 0.0167527 0.0163814 0.0337995 0.0289854 0.0245593 0.0206033 0.0324782 0.0145730 0.0166393 0.0159460 0.0162265 0.0153821 0.0171601 0.0304754 0.0192077 0.0164729 0.0266545 0.0282273 0.0161405 0.0305152 0.0173996
GJ 0.0164721 0.0197878 0.0187357 0.0339422 0.0314310 0.0245153 0.0231664 0.0173068 0.0214934 0.0256548 0.0167527 0.0000000 0.0151043 0.0353594 0.0304644 0.0271990 0.0212521 0.0339120 0.0176500 0.0166582 0.0158933 0.0165035 0.0137941 0.0161212 0.0314389 0.0182391 0.0166257 0.0276478 0.0327586 0.0149816 0.0354195 0.0214097
JV 0.0162117 0.0189161 0.0174954 0.0340365 0.0308357 0.0219338 0.0216017 0.0171130 0.0211447 0.0263371 0.0163814 0.0151043 0.0000000 0.0344323 0.0307617 0.0287256 0.0191716 0.0352049 0.0189470 0.0157196 0.0164529 0.0153160 0.0131920 0.0159127 0.0308224 0.0156749 0.0179585 0.0279383 0.0339841 0.0166767 0.0363474 0.0213632
KA 0.0355746 0.0335354 0.0321670 0.0252982 0.0325807 0.0340184 0.0409459 0.0341757 0.0335061 0.0236291 0.0337995 0.0353594 0.0344323 0.0000000 0.0190681 0.0414175 0.0389326 0.0235086 0.0329279 0.0374017 0.0383461 0.0366842 0.0342726 0.0317150 0.0193344 0.0337279 0.0370079 0.0175659 0.0345171 0.0383341 0.0358064 0.0298833
KB 0.0303433 0.0296384 0.0286313 0.0261380 0.0299775 0.0315772 0.0380001 0.0295207 0.0287778 0.0221377 0.0289854 0.0304644 0.0307617 0.0190681 0.0000000 0.0368059 0.0359200 0.0117009 0.0277477 0.0323038 0.0326546 0.0324584 0.0301154 0.0282422 0.0217759 0.0314428 0.0311908 0.0167801 0.0306686 0.0325043 0.0325571 0.0264465
KM 0.0285037 0.0260247 0.0282892 0.0397149 0.0322526 0.0322944 0.0359048 0.0233035 0.0328450 0.0338229 0.0245593 0.0271990 0.0287256 0.0414175 0.0368059 0.0000000 0.0352685 0.0398091 0.0221475 0.0294939 0.0284646 0.0302520 0.0271601 0.0284409 0.0384818 0.0309566 0.0264761 0.0357467 0.0334697 0.0279264 0.0353186 0.0258690
LC 0.0221734 0.0236629 0.0223153 0.0390792 0.0352532 0.0277815 0.0166704 0.0221045 0.0274776 0.0312204 0.0206033 0.0212521 0.0191716 0.0389326 0.0359200 0.0352685 0.0000000 0.0406755 0.0236781 0.0207346 0.0214815 0.0193848 0.0195040 0.0214650 0.0355828 0.0210133 0.0221636 0.0324812 0.0372541 0.0218466 0.0410061 0.0251646
LL 0.0338473 0.0337474 0.0331018 0.0295467 0.0345354 0.0364529 0.0425012 0.0332522 0.0324182 0.0257589 0.0324782 0.0339120 0.0352049 0.0235086 0.0117009 0.0398091 0.0406755 0.0000000 0.0309655 0.0359579 0.0358022 0.0360280 0.0341083 0.0318546 0.0261726 0.0363706 0.0344018 0.0211644 0.0340164 0.0355534 0.0361843 0.0305224
LM 0.0188846 0.0174792 0.0182653 0.0321425 0.0228768 0.0231211 0.0262426 0.0130664 0.0233297 0.0239398 0.0145730 0.0176500 0.0189470 0.0329279 0.0277477 0.0221475 0.0236781 0.0309655 0.0000000 0.0188936 0.0179248 0.0188001 0.0172917 0.0183495 0.0295287 0.0207955 0.0168323 0.0256663 0.0238840 0.0177935 0.0264541 0.0149584
MM 0.0178398 0.0202507 0.0198932 0.0365994 0.0331576 0.0258244 0.0236694 0.0180485 0.0234037 0.0283194 0.0166393 0.0166582 0.0157196 0.0374017 0.0323038 0.0294939 0.0207346 0.0359579 0.0188936 0.0000000 0.0125471 0.0124966 0.0154447 0.0177282 0.0337550 0.0192789 0.0166609 0.0293564 0.0347920 0.0166414 0.0375904 0.0223864
MT 0.0170678 0.0204187 0.0202839 0.0370556 0.0342563 0.0272402 0.0238168 0.0172908 0.0230408 0.0280406 0.0159460 0.0158933 0.0164529 0.0383461 0.0326546 0.0284646 0.0214815 0.0358022 0.0179248 0.0125471 0.0000000 0.0125909 0.0156450 0.0177622 0.0349554 0.0205679 0.0152600 0.0297690 0.0350223 0.0144608 0.0381636 0.0227158
QG 0.0179127 0.0200585 0.0192087 0.0365161 0.0327005 0.0251380 0.0231343 0.0177681 0.0231056 0.0278849 0.0162265 0.0165035 0.0153160 0.0366842 0.0324584 0.0302520 0.0193848 0.0360280 0.0188001 0.0124966 0.0125909 0.0000000 0.0150762 0.0174641 0.0333414 0.0184967 0.0165357 0.0293283 0.0343993 0.0161502 0.0377306 0.0215739
RC 0.0164681 0.0185294 0.0166406 0.0333541 0.0293691 0.0218596 0.0217015 0.0162922 0.0212814 0.0253777 0.0153821 0.0137941 0.0131920 0.0342726 0.0301154 0.0271601 0.0195040 0.0341083 0.0172917 0.0154447 0.0156450 0.0150762 0.0000000 0.0138413 0.0306514 0.0152993 0.0169273 0.0274372 0.0318366 0.0144455 0.0345417 0.0203048
RL 0.0181886 0.0197295 0.0183536 0.0299791 0.0297479 0.0236070 0.0238932 0.0178054 0.0224650 0.0224607 0.0171601 0.0161212 0.0159127 0.0317150 0.0282422 0.0284409 0.0214650 0.0318546 0.0183495 0.0177282 0.0177622 0.0174641 0.0138413 0.0000000 0.0275385 0.0181671 0.0185550 0.0253345 0.0313771 0.0167345 0.0345433 0.0207855
RP 0.0323123 0.0297079 0.0289344 0.0154639 0.0296424 0.0300860 0.0375295 0.0305035 0.0305677 0.0151134 0.0304754 0.0314389 0.0308224 0.0193344 0.0217759 0.0384818 0.0355828 0.0261726 0.0295287 0.0337550 0.0349554 0.0333414 0.0306514 0.0275385 0.0000000 0.0302712 0.0332392 0.0177010 0.0313996 0.0345974 0.0331327 0.0267652
SC 0.0207600 0.0201100 0.0155876 0.0342866 0.0282460 0.0206356 0.0237068 0.0193922 0.0242452 0.0270017 0.0192077 0.0182391 0.0156749 0.0337279 0.0314428 0.0309566 0.0210133 0.0363706 0.0207955 0.0192789 0.0205679 0.0184967 0.0152993 0.0181671 0.0302712 0.0000000 0.0214085 0.0286619 0.0325139 0.0200717 0.0347512 0.0207936
SD 0.0178579 0.0185614 0.0196366 0.0354557 0.0303569 0.0260934 0.0242312 0.0175124 0.0236657 0.0268538 0.0164729 0.0166257 0.0179585 0.0370079 0.0311908 0.0264761 0.0221636 0.0344018 0.0168323 0.0166609 0.0152600 0.0165357 0.0169273 0.0185550 0.0332392 0.0214085 0.0000000 0.0288431 0.0303812 0.0160337 0.0328389 0.0200819
SL 0.0283760 0.0272561 0.0263040 0.0220385 0.0288153 0.0295962 0.0350436 0.0271138 0.0273561 0.0184739 0.0266545 0.0276478 0.0279383 0.0175659 0.0167801 0.0357467 0.0324812 0.0211644 0.0256663 0.0293564 0.0297690 0.0293283 0.0274372 0.0253345 0.0177010 0.0286619 0.0288431 0.0000000 0.0293650 0.0301524 0.0314807 0.0244301
SP 0.0337146 0.0262603 0.0267735 0.0350017 0.0189424 0.0262150 0.0397157 0.0276345 0.0339258 0.0294554 0.0282273 0.0327586 0.0339841 0.0345171 0.0306686 0.0334697 0.0372541 0.0340164 0.0238840 0.0347920 0.0350223 0.0343993 0.0318366 0.0313771 0.0313996 0.0325139 0.0303812 0.0293650 0.0000000 0.0344131 0.0174452 0.0192507
SY 0.0169339 0.0216470 0.0195137 0.0367025 0.0340537 0.0268463 0.0242384 0.0174463 0.0228591 0.0275287 0.0161405 0.0149816 0.0166767 0.0383341 0.0325043 0.0279264 0.0218466 0.0355534 0.0177935 0.0166414 0.0144608 0.0161502 0.0144455 0.0167345 0.0345974 0.0200717 0.0160337 0.0301524 0.0344131 0.0000000 0.0375182 0.0229745
TM 0.0363946 0.0280640 0.0290556 0.0365534 0.0198847 0.0277910 0.0427709 0.0301545 0.0364258 0.0315750 0.0305152 0.0354195 0.0363474 0.0358064 0.0325571 0.0353186 0.0410061 0.0361843 0.0264541 0.0375904 0.0381636 0.0377306 0.0345417 0.0345433 0.0331327 0.0347512 0.0328389 0.0314807 0.0174452 0.0375182 0.0000000 0.0210962
WC 0.0228502 0.0152160 0.0178810 0.0304946 0.0168270 0.0209021 0.0276300 0.0171157 0.0244998 0.0235851 0.0173996 0.0214097 0.0213632 0.0298833 0.0264465 0.0258690 0.0251646 0.0305224 0.0149584 0.0223864 0.0227158 0.0215739 0.0203048 0.0207855 0.0267652 0.0207936 0.0200819 0.0244301 0.0192507 0.0229745 0.0210962 0.0000000


.inline-btns p {
  display: inline;
}
library(downloadthis)
Fst_results <- read.csv("files/Fst.csv")
d_btn <- Fst_results %>% 
  download_this(
    path = system.file("/home/caro/Escritorio/BASA_2/basa_Rmd/", package = "downloadthis"),
    output_name = "Genetic distances",
    output_extension = ".csv",
    button_label = "Genetic distances",
    button_type = "default",
    has_icon = TRUE,
    icon = "fa fa-save"
  )


Genetic distances results here



Table 3. AMOVA results
X Sigma X.
Variations Between samples 278.0909 22.07262
Variations Within samples 981.7995 77.92738
Total variations 1259.8903 100.00000

Spatial genetic structure

Map, PCA, UMAP, entropy, admixture piecharts:
B. sagittata map:


#devtools::install_github("dkahle/ggmap")
# Sampling-site map for B. sagittata populations on a terrain basemap.
library(ggmap)
library(devtools)
require(ggplot2)
require(ggsci)
require(ggrepel)
library(data.table)
library(patchwork)

# NOTE(review): setwd() in scripts is fragile; consider absolute paths.
setwd("/home/caro/Escritorio/BASA_2/con_outgroup/Map/")
# coordinates.csv: one row per population with latitude and longitude.
basacoord <- read.csv('coordinates.csv', header=T)
names(basacoord) <- c('Pop','Lat','Long')

# Download basemap tiles covering the sampled extent.
# NOTE(review): get_stamenmap() depends on the retired Stamen tile
# service — confirm tiles still resolve, or migrate to get_stadiamap().
map <- get_stamenmap(bbox = c(left = -120.5, bottom = 37.00, right = -109.0, top = 49.00),
zoom=8,scale = 3,maptype = 'terrain-background')
##zoom controls resolution, 10 takes forever

# Quick preview: plain white points at each sampling site.
ggmap(map) + geom_point(data = basacoord, aes(x = Long, y = Lat),size=3,pch=21,fill="white",col="black") +
xlab("Longitude") + ylab("Latitude") +
coord_map(xlim=c(-120.5,-109.0),ylim=c(37.00,49.00)) #selects the range for x and y


# One fill colour per population code (matches the PCA/UMAP palettes).
group.colors   <- c(AN = "#F0F8FF", AS ="#E9967A", BH ="#FFEBCD", BB = "#F08080",CF = "#8B0000",CC ="#FF3030",CN ="#CD6090",CL = "#8B3A62", EC = "#FF82AB", 
                    GA ="#FFC0CB",GB ="#FFE1FF", GJ="#CDB5CD",JV ="#8B7B8B", KA ="#9B30FF",KB="#8470FF",KM ="#27408B",LM ="#87CEFA", 
                    LL="#104E8B",LC="#8DB6CD",MM="#6E7B8B",MT="#FFB90F",QG="#EE7600",RC="#FFF68F",RL="#8B7500", 
                    SL="#2F4F4F",SD="#008B8B",SP="#006400",SC="#BDB76B", SY="#ADFF2F", TM="#00CD66",RP="#00CDCD", WC="#BBFFFF")

# Final map: points plus non-overlapping population labels (ggrepel).
map <- ggmap(map) + geom_point(data = basacoord, aes(x = Long, y = Lat),size=2,col="black",fill='white',pch=21) +
geom_label_repel(data = basacoord, aes(x = Long, y = Lat,label = Pop,fill=Pop),
colour = "black", fontface = "bold") +
    scale_fill_manual(values=group.colors) +
coord_map(xlim=c(-120.5,-109.0),ylim=c(37.00,49.00)) +
xlab("Longitude") + ylab("Latitude") + theme_bw() +
theme(legend.position = 'none',
axis.text = element_text(size=13),
axis.title = element_text(size = 15, colour="black",face = "bold", vjust = 1),
panel.border = element_rect(size = 1.5, colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
map
ggsave('map_basa.pdf',height=8,width=5)


PCA:


library(data.table)
library(ggplot2)
library(ggsci)
library(umap)
library(LEA)
library(readr)
library(ggpubr)
setwd("/home/caro/Escritorio/BASA_2/con_outgroup/PCA_UMAP/")

##### PCA  #####
# Genotype PCA for an individuals x loci 012 matrix (missing coded -1).
# Each locus is centered on its mean and scaled by sqrt(p*(1-p)) where
# p is the allele frequency; missing entries become 0 after centering.
# df_gen: 012 genotype matrix; indv: data.frame with Pop and ID columns;
# num: number of PCs to keep (default 5, capped at the number of rows).
# Returns list(pca_df = PC scores + Pop + ID, pve = variance explained).
PCA_gen <- function(df_gen, indv, num = 5) {
  geno <- apply(df_gen, 2, function(v) as.numeric(gsub(-1, NA, v, fixed = TRUE)))
  locus_mean <- apply(geno, 2, mean, na.rm = TRUE)
  freq <- locus_mean / 2
  scaled <- matrix(nrow = nrow(geno), ncol = ncol(geno))
  for (j in 1:length(freq)) {
    scaled[, j] <- (geno[, j] - locus_mean[j]) / sqrt(freq[j] * (1 - freq[j]))
  }

  # Missing (and monomorphic-locus NaN) entries contribute nothing.
  scaled[is.na(scaled)] <- 0
  fit <- prcomp(scaled, scale. = FALSE, center = FALSE)
  pve <- summary(fit)$importance[2, ]
  print(pve[1:50])

  if (nrow(geno) < num) {
    num <- nrow(geno)
  }
  scores <- as.data.frame(fit$x[, 1:num])
  scores$Pop <- indv$Pop
  scores$ID <- indv$ID
  return(list("pca_df" = scores, "pve" = pve))
}

######################## PCA_fig() ###################

PCA_fig <- function(pca_out, fill = "Pop") {
  # Scatter plot of PC1 vs PC2 from a PCA_gen() result; axis labels carry
  # the percent variance explained, points are filled by the column named
  # in `fill` (default "Pop").
  pct <- function(i) round(pca_out$pve[i] * 100, 2)
  x_lab <- paste("PCA1 (", pct(1), "%)", sep = "")
  y_lab <- paste("PCA2 (", pct(2), "%)", sep = "")
  dat <- pca_out$pca_df
  filler <- as.character(dat[[fill]])
  ggplot(data = dat, aes(x = PC1, y = PC2, fill = filler)) +
    geom_point(pch = 21, colour = 'black', size = 3) +
    xlab(x_lab) +
    ylab(y_lab) +
    theme_bw() +
    theme(#legend.position = 'none',
      axis.text = element_text(size = 13),
      axis.title = element_text(size = 16, colour = "black", face = "bold", vjust = 1),
      panel.border = element_rect(size = 1.5, colour = "black"),
      legend.text = element_text(size = 13),
      legend.title = element_text(size = 16, colour = "black", face = "bold", vjust = 1),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank())
}

###### Load 012 genotypes; the first column of vcftools' .012 output is the
###### individual index, not a genotype, so drop it.
df012<-fread("basa.012",sep="\t", data.table=F) 
df012 <- df012[,-1]
Pop_ID_Sum <- read.csv('Pop_ID_sin.csv')

############# Run the PCA (PCA_gen returns `num` PCs, 5 by default)
pca_out <- PCA_gen(df012,Pop_ID_Sum)
pve <- pca_out$pve[1:50]
pve

# Fixed: pca_df has only num PCs plus Pop/ID columns, so selecting
# [,1:50] was an out-of-bounds subscript. Keep all returned columns.
pca_df <- pca_out$pca_df
pca_df <- cbind(Pop_ID_Sum,pca_df)
# Fixed file name (was 'pca_df_.csv') so the read.csv below finds it.
write.csv(pca_df,'pca_df.csv',row.names = FALSE)

############# Reload scores and plot PC1 vs PC2
pca_df <- read.csv("pca_df.csv")
pve <- c(0.06575, 0.03794, 0.02006, 0.01616, 0.01205) 

# Fixed: the closing parenthesis of this vector was trapped inside a
# trailing comment, leaving the c() call unbalanced (syntax error).
group.colors <- c(AN = "#F0F8FF", AS ="#E9967A", BH ="#FFEBCD", BB = "#F08080",CF = "#8B0000",CC ="#FF3030",CN ="#CD6090",CL = "#8B3A62", EC = "#FF82AB", 
                  GA ="#FFC0CB",GB ="#FFE1FF", GJ="#CDB5CD",JV ="#8B7B8B", KA ="#9B30FF",KB="#8470FF",KM ="#27408B",LM ="#87CEFA", 
                  LL="#104E8B",LC="#8DB6CD",MM="#6E7B8B",MT="#FFB90F",QG="#EE7600",RC="#FFF68F",RL="#8B7500", 
                  SL="#2F4F4F",SD="#008B8B",SP="#006400",SC="#BDB76B", SY="#ADFF2F", TM="#00CD66",RP="#00CDCD", WC="#BBFFFF")
                  #,B.hookeri="#ffff00ff", BAH03xB.hybrid="#000000ba"

# (Removed a redundant color=Pop mapping: the layer overrides colour with
# black, so only the fill aesthetic is actually displayed.)
PCA1VS2 <- ggplot(data = pca_df, aes(x=PC1,y=PC2,fill=as.character(Pop))) + 
  geom_point(pch=21,colour='black',size = 5) +
  xlab(paste("PC",1," (",pve[1]*100,"%)",sep="")) + ylab(paste("PC",2," (",pve[2]*100,"%)",sep=""))+
  scale_fill_manual(values=group.colors)+ theme_bw() + 
  theme(legend.position = 'right',
        axis.text = element_text(size=11), 
        axis.title = element_text(size = 13, colour="black",face = "bold",vjust = 1),
        panel.border = element_rect(size = 1.5, colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
ggsave("pca1_2.pdf",PCA1VS2,height=8,width = 10,units = 'in')


UMAP:


#### UMAP: 2-D embedding of the 012 genotype matrix
umap_conf <- umap.defaults
umap_conf$min_dist <- 0.9
umap_conf$n_neighbors <- 14

umap_fit <- umap(df012, config = umap_conf)
umap_g <- as.data.frame(umap_fit$layout)
names(umap_g) <- c('layout1','layout2')
umap_g <- cbind(Pop_ID_Sum, umap_g)

#### UMAP gprob ####
umap_g_plot <- ggplot(data = umap_g, aes(x = layout1, y = layout2, fill = as.character(Pop))) +
  geom_point(colour = 'black', size = 5, pch = 21) + #ggtitle("UMAP n_neighbors 14 min_dist 0.8") +
  xlab('Layout 1') +
  ylab('Layout 2') +
  scale_fill_manual(values = group.colors) +
  theme_bw() +
  theme(legend.position = 'right',
        axis.text = element_text(size = 11),
        axis.title = element_text(size = 13, colour = "black", face = "bold", vjust = 1),
        panel.border = element_rect(size = 1.5, colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
ggsave("umap_09_14.pdf", umap_g_plot, height = 8, width = 10, units = 'in')


entropy:


#perl /working/mascaro/acth/entropy2/vcf2mpglV1.3TLP.pl basa.vcf
#perl /working/mascaro/acth/entropy2/gl2genestV1.3.pl basa.mpgl mean

########### PCA_entropy
##row = loci, col = indv (transpose before calling, as done below)
require(readr)
require(MASS)
require(LEA)
require(ggplot2)

# PCA of a genotype (point-estimate) matrix: rows = samples after the
# caller transposes. Each locus is centered and scaled by sqrt(p*(1-p));
# NaNs from monomorphic loci become 0 after normalization.
# num: number of PCs to return. Was hard-coded to 27, which raised a
# subscript error whenever fewer PCs existed; it is now a parameter
# (default 27 for backward compatibility) clamped to the available PCs.
PCA_entropy <- function(g, num = 27){
  colmean<-apply(g,2,mean,na.rm=TRUE)
  normalize<-matrix(nrow = nrow(g),ncol=ncol(g))
  af<-colmean/2
  for (m in 1:length(af)){
    nr<-g[ ,m]-colmean[m]
    dn<-sqrt(af[m]*(1-af[m]))
    normalize[ ,m]<-nr/dn
  }
  
  normalize[is.na(normalize)]<-0
  method1<-prcomp(normalize, scale. = FALSE,center = FALSE)
  pve <- summary(method1)$importance[2,]
  print(pve[1:5])
  # Clamp so small data sets no longer error out.
  num <- min(num, ncol(method1$x))
  pca_df<- method1$x[,1:num]
  return(pca_df)
} 

require(readr)
require(MASS)
require(LEA)
require(ggplot2)
# Genotype point estimates (rows = loci, cols = individuals) from gl2genest.
g <- read.table("pntest_mean_variants_maf5_miss9_thin100_noBadInds.recode.txt", header=F)
Pop_ID_Sum <- read.csv("Pop_ID.csv")
pca_df <- PCA_entropy(t(g))

# Header for entropy: two rows (species_pop and pop_ID), one column per individual.
Pop_ID <- read.csv("Pop_ID.csv")
Sp_Pop <- paste("AT",Pop_ID$Pop,sep="_")
Pop_ID <- paste(Pop_ID$Pop,Pop_ID$ID,sep="_")
Header <- data.frame(Sp_Pop,Pop_ID)
write.table(t(Header),'entropy_header.txt',sep = " ", quote = FALSE,row.names =FALSE)

dim(g)
#[1] loci_individuals

Pop_ID_Sum <- read.csv("Pop_ID_Sum.csv")
pca_df <- PCA_entropy(t(g)) ######change PCA_entropy to allow for selecting PC# output and tw 
# Fixed: removed a stray "test" token left on its own line here, which
# would raise "object 'test' not found" when the script is sourced.

library(MASS)

# For each K in 2..10: k-means on the first 5 PCs, then LDA with
# leave-one-out cross-validation on the k-means clusters. The posterior
# membership probabilities (ldakK.txt) seed entropy's q starting values.
# (lda() consumes no RNG, so looping gives the same random stream as the
# original nine sequential kmeans calls.)
for (k in 2:10) {
  km <- kmeans(pca_df[, 1:5], k, iter.max = 10, nstart = 10,
               algorithm = "Hartigan-Wong")
  ld <- lda(x = pca_df[, 1:5], grouping = km$cluster, CV = TRUE)
  write.table(round(ld$posterior, 5), file = paste0("ldak", k, ".txt"),
              quote = F, row.names = F, col.names = F)
}

#cat entropy_header.txt basa.mpgl > good_snps.entropy.mpgl
## add "584 36161 1" (individuals, loci, ploidy) to the top of entropy_header.txt
## and leave that line empty — see the example #########
# Run entropy for K = 2..10: 70k MCMC steps (-l), 30k burn-in (-b),
# thin 10 (-t), seeding admixture proportions (-q) with the matching LDA
# posteriors. Commands were previously line-wrapped mid-argument; each is
# now a single line.
entropy -i basa_entropy.mpgl -o basa_k2.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 2 -q ldak2.txt -m 1 -w 0 &> k2stdout.txt
# Fixed: the K=3 run was seeded with ldak2.txt instead of ldak3.txt.
entropy -i basa_entropy.mpgl -o basa_k3.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 3 -q ldak3.txt -m 1 -w 0 &> k3stdout.txt
entropy -i basa_entropy.mpgl -o basa_k4.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 4 -q ldak4.txt -m 1 -w 0 &> k4stdout.txt
entropy -i basa_entropy.mpgl -o basa_k5.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 5 -q ldak5.txt -m 1 -w 0 &> k5stdout.txt
entropy -i basa_entropy.mpgl -o basa_k6.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 6 -q ldak6.txt -m 1 -w 0 &> k6stdout.txt
entropy -i basa_entropy.mpgl -o basa_k7.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 7 -q ldak7.txt -m 1 -w 0 &> k7stdout.txt
entropy -i basa_entropy.mpgl -o basa_k8.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 8 -q ldak8.txt -m 1 -w 0 &> k8stdout.txt
entropy -i basa_entropy.mpgl -o basa_k9.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 9 -q ldak9.txt -m 1 -w 0 &> k9stdout.txt
entropy -i basa_entropy.mpgl -o basa_k10.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 10 -q ldak10.txt -m 1 -w 0 &> k10stdout.txt

### DIC value
module load entropy/1.2

### Extract the posterior deviance (summary stat 3) from each entropy run;
### DIC values are used to compare support across K.
for k in $(seq 2 10); do
    estpost.entropy basa_k${k}.hdf5 -s 3 -p deviance > DIC_k${k}.txt
done

### Generate entropy q (ancestry proportion) files, one per K:
for k in $(seq 2 10); do
    estpost.entropy basa_k${k}.hdf5 -p q -s 0 -o q${k}.txt
done

### Generate genotype-probability (gprob) file from the K=2 run:
estpost.entropy basa_k2.hdf5 -p gprob -s 0 -o gprob.txt

# Plotting: stacked barplot of entropy ancestry proportions for K = 2
library(ggplot2)
library(forcats)
library(ggthemes)
library(patchwork)

# q2.csv: long-format table; expects columns sampleID, prob, popGroup, loc
kdf2 <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/entropy/definitivo/q2.csv")
k2plot <-
  ggplot(kdf2, aes(factor(sampleID), prob, fill = factor(popGroup))) +
  geom_col(color = "gray", size = 0.1) +
  # one panel per locality, kept in input order
  facet_grid(~fct_inorder(loc), switch = "x", scales = "free", space = "free") +
  theme_minimal() + labs(x = "Populations", title = "K=2", y = "Ancestry") +
  scale_y_continuous(expand = c(0, 0)) +
  scale_x_discrete(expand = expansion(add = 1)) +
  theme(
    panel.spacing.x = unit(0.1, "lines"),
    axis.text.x = element_blank(),
    panel.grid = element_blank()
  ) +
  scale_fill_gdocs(guide = FALSE)

k2plot
# BUG FIX: previously saved `k3plot` (undefined at this point) — save the
# K=2 plot that was just built.
ggsave("k2.pdf", k2plot, height = 8, width = 20, units = 'in')


Admixture piecharts:


## Admixture pie chart

BiocManager::install("LEA")
# Load packages
library(adegenet)
library(poppr)
library(LEA)
library(reshape2)
library(dplyr)
library(ggplot2)
library(rworldmap)
library(rworldxtra)
library(ggsn)
library(sf)
library(raster)
library(rgeos)
library(maps)
library(maptools)
library(grid)
library(miscTools)
library(stringr)
library(ggpubr)

# Read the K=4 ancestry matrix (semicolon-separated; expects columns
# ID, Pop, then one column per cluster — TODO confirm column names)
qmatrix<-read.csv("Desktop/q4.csv",header=TRUE, sep = ";")
qmatrix
# Label column names of qmatrix
ncol(qmatrix)
head(qmatrix)

# Wide -> long: one row per individual x cluster
qlong = melt(qmatrix, id.vars=c("ID","Pop"))
head(qlong)

# One colour per cluster, interpolated from a 4-colour ramp
pal = colorRampPalette(c("red","blue", "green", "light blue"))
cols = pal(length(unique(qlong$variable)))


# Stacked barplot of per-individual admixture proportions, faceted by population
admix.bar = ggplot(data=qlong, aes(x=ID, y=value, fill=variable))+
  geom_bar(stat = "identity")+
  scale_y_continuous(expand = c(0,0))+
  facet_wrap(~Pop, scales = "free", ncol = 2)+
  scale_fill_manual(values = cols)+
  ylab("Admixture proportion")+
  # xlab("Individual")+
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
        strip.text = element_text(colour="black", size=12),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 12))
admix.bar
# NOTE(review): ggsave with no plot argument saves the last plot displayed
ggsave("Desktop/1.admixture_barplot.png", width=6, height=10, dpi=300)


# Calculate mean admixture proportions for each site
head(qmatrix)
clusters = grep("K", names(qmatrix)) # indexes of cluster columns
# Per-site mean of each cluster column; first grouping column becomes Group.1
avg_admix = aggregate(qmatrix[, clusters], list(qmatrix$Pop), mean)

# Order alphabetically by site
avg_admix = avg_admix[order(as.character(avg_admix$Group.1)), ]
avg_admix

# Convert dataframe from wide to long format (Group.1, variable, value)
avg_admix = melt(avg_admix, id.vars = "Group.1")
head(avg_admix)

# Draw a single admixture pie chart (no legend, no axes) for one site.
#   admix_df: long-format data frame of mean admixture proportions per site
#   site:     site code, matched against admix_df$Group.1
#   cols:     vector of colours, one per cluster
pie_charts <- function(admix_df, site, cols){
  site_rows <- subset(admix_df, Group.1 == site)
  stacked_bar <- ggplot(site_rows, aes(x = "", y = value, fill = variable)) +
    geom_bar(width = 1, stat = "identity", colour = "black", show.legend = FALSE)
  # wrap the single stacked bar into polar coordinates -> pie
  stacked_bar +
    coord_polar(theta = "y") +
    scale_fill_manual(values = cols) +
    theme_void()
}

# Sanity check: draw the pie for a single site
pie_charts(avg_admix, site = "AN", cols = cols)

# Sites to plot (sorted alphabetically)
subsites <- sort(c("AN", "AS", "BB", "BH", "CC", "CF", "CL", "CN", "EC", "GA",
                   "GB", "GJ", "JV", "KA", "KB", "KM", "LC", "LL", "LM", "MM",
                   "MT", "QG", "RC", "RL", "RP", "SC", "SD", "SL", "SP", "SY",
                   "TM", "WC"))

# Build one pie chart per site, stored in a list named by site code
pies <- setNames(
  lapply(subsites,
         function(s) pie_charts(admix_df = avg_admix, site = s, cols = cols)),
  subsites
)


# Prepare basemap

# Import csv file containing coordinates (semicolon-separated; needs
# columns Pop, Lon, Lat — TODO confirm against the file)
coords = read.csv("Desktop/coordinates.csv", sep = ";")

# Subset coordinates to the sites being plotted
coords = coords[coords$Pop %in% subsites, ]

# Order alphabetically by site
coords = coords[order(coords$Pop), ] 
coords

# Check order matches coords order: every element must be TRUE or the
# pies will be drawn at the wrong coordinates
as.character(avg_admix$Group.1) == as.character(coords$Pop)

# Set map boundary (xmin, xmax, ymin, ymax)
boundary = extent(-120.5, -109, 37, 49)
boundary

# Get map outlines from rworldmap package
map.outline = getMap(resolution = "high")

# Crop to boundary and convert to dataframe for ggplot
map.outline = crop(map.outline, y = boundary) %>% fortify()

# Plot basemap: grey land polygons over a light blue (water) panel
basemap = ggplot()+
  geom_polygon(data=map.outline, aes(x=long, y=lat, group=group), fill="grey",
               colour="black", size=0.5)+
  coord_quickmap(expand=F)+
  # optional north arrow / scale bar, currently disabled:
  #lggsn::north(map.outline, symbol = 10, scale = 0.06, location = "topleft")+
  #lggsn::scalebar(data = map.outline, dist = 50, dist_unit = "km", height = 0.01,
  #ltransform = TRUE, model = "WGS84" )+
                 #location = "bottomleft", anchor = c(x = -114, y = 38),
                 #st.bottom = FALSE, st.size = 4, st.dist = 0.015)+
  xlab("Longitude")+
  ylab("Latitude")+
  theme(
    axis.text = element_text(colour="black", size=12),
    axis.title = element_text(colour="black", size=14),
    panel.background = element_rect(fill="lightsteelblue2"),
    panel.border = element_rect(fill = NA, colour = "black", size = 0.5),
    panel.grid.minor = element_line(colour="grey90", size=0.5),
    panel.grid.major = element_line(colour="grey90", size=0.5),
    legend.text = element_text(size=12),
    legend.title = element_blank(),
    legend.key.size = unit(0.7, "cm"),
    legend.position = "top"
  )
basemap

# Longitude/latitude for each plotted site, keyed by site code
coord.list <- list()
for (i in subsites){
  coord.list[[i]] <- c(subset(coords, Pop == i)$Lon, subset(coords, Pop == i)$Lat)
}
coord.list

# Radius (in map degrees) of each inset pie chart
radius <- 0.45

# Convert each ggplot pie into an annotation_custom layer centred on its
# site's coordinates.
# FIX: use seq_along(subsites) instead of 1:length(subsites), which would
# iterate over c(1, 0) if subsites were ever empty.
pies.ac <- list()
for (i in seq_along(subsites)){
  pies.ac[[i]] <- annotation_custom(grob = ggplotGrob(pies[[i]]),
                                    xmin = coord.list[[i]][[1]] - radius,
                                    xmax = coord.list[[i]][[1]] + radius,
                                    ymin = coord.list[[i]][[2]] - radius,
                                    ymax = coord.list[[i]][[2]] + radius)
}

# Add pie layers to basemap and save
pie.map <- basemap + pies.ac
pie.map
ggsave("Desktop/4.pie_charts_map.png", width = 8, height = 10, dpi = 300)

# Combine barplot and map into a two-panel (A/B) figure
ggarrange(admix.bar + theme(legend.position = "right") + labs(title = "Individual admixture proportions", tag = "A"),
          pie.map + labs(title = "Mean admixture proportions for each site", tag = "B"))
ggsave("Desktop/4.Admixture_bar_map.png", width = 30, height = 10, dpi = 300)


IBD, IBE:



EEMS, unPC:


#EEMS

# Quick inspection of the pairwise genetic differences matrix used as
# EEMS input (example file from the EEMS distribution)
diffs <- read.table("./test/example-SNP-major-mode.diffs")
diffs

datapath = ./data/inputdata
mcmcpath = ./data/outputdata
nIndiv = 246
nSites = 5677
nDemes = 300
diploid = false
numMCMCIter = 200000000
numBurnIter = 100000000
numThinIter = 999999

./runeems_snps --params configurationfile.ini --seed 123 (random seed, optional)
./runeems_snps --params configurationfile2.ini --seed 123 
./runeems_snps --params configurationfile3.ini --seed 123 

EEMS results visualization:
library(rEEMSplots)

mcmcpath <- c("./eems_output/output_1", "./eems_output/output_2", "./eems_output/output_3")
plotpath <- "./eems_output/plots"

#unPC

library(unPC)

# Run unPC on the PCA eigenvector file plus per-individual coordinates;
# writes plots and an .rds of pairwise calculations with the "unPC" prefix
unPC(inputToProcess  = "pca_eigen.evec", outputPrefix = "unPC",
     runDataImport = TRUE, runPairwiseCalc = TRUE, geogrCoords= "Lon_Lat.txt",
     roundEarth = TRUE, firstPC = 1, secondPC = 2, runPlotting = TRUE,
     applyManualColor = FALSE, geogrCoordsForPlotting = "Lon_Lat_popnAggregated.txt", 
     plotWithMap = FALSE,
     ellipseWidth = 0.05, populationPointNormalization = 7,
    runAggregated = TRUE, savePlotsToPdf = TRUE)

# Export pairwise unPC scores (ratio of PC-space to geographic distance)
unpcout <- readRDS("unPC_pairwiseDistCalc_unPC.rds")
unPC <- unpcout$ratioPCToGeogrDist 
write.csv(unpcout, "unPCscores.csv")


Influence of environment

pRDA:


############## RDA
library(adegenet)
library(poppr)
library(tidyverse)
library(vcfR)
library(hierfstat)
library(pegas)
library(magrittr)
library(ape)
library(psych)
library(adespatial)
library(vegan)

## Loading the genetic data 
# NOTE(review): setwd() in scripts is fragile; prefer absolute paths
setwd("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/")
##setwd("/home/caro/Escritorio/")
# 012 genotype matrix: rows = individuals, columns = SNPs, values 0/1/2
gen <- read.csv("basa_prda.012", row.names=1)
##gen <- read.csv("acth_012.csv", row.names=1)
dim(gen)
sum(is.na(gen))
# Impute missing genotypes with the per-SNP mode (most common genotype)
gen.imp <- apply(gen, 2, function(x) replace(x, is.na(x), as.numeric(names(which.max(table(x))))))

## Env. variables
Env <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/Env.csv", header = TRUE)
Coordinates <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/Lon_Lat.csv")
PopStruct <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/pca_df_.csv")

## Table gathering all variables (rows assumed in the same individual
## order across the three files — TODO confirm)
Variables <- data.frame(Env, Coordinates, PopStruct)
Variables[1:5,]

## Null model (intercept only)
RDA0 <- rda(gen.imp ~ 1, Variables)

## Full model: all climate/soil/topography predictors plus geography
## (Lon, Lat) and population structure (PC1, PC2).
## (DeclAET was dropped; underscores were removed from the IDs.)
## FIX: the predictor WsAETspr had been split across two lines by a bad
## line wrap ("WsAE" / "Tspr"), which is a syntax error in R.
RDAfull <- rda(gen.imp ~ ppt + tdmean + tmax + tmean + tmin + vpdmax + vpdmin +
                 slope + Af + soilawc + sloprad + afrad + heatload + CumlWs +
                 DurCWD + OnsetCWD + CumlAET + CumlCWD + CumlPET + WsAETspr +
                 FallAET + MaxCWD + pcseas + CumlGDD + AETgdd + DurGDD +
                 mxtmpfal + mxtmpspr + mxtmpsum + mxtmpwin + mntmpfal +
                 mntmpspr + mntmpsu + mntmpwin + prcpfal + prcpspr + prcpsum +
                 prcpwin + Elevation + mndtmpfal + mndtmpspr + mndtmpsu +
                 mndtmpwin + Lon + Lat + PC1 + PC2, Variables)

## Forward selection. "To conduct the selection procedure we used the
## ordiR2step function of the package vegan and the following stopping
## criteria: variable significance of p < 0.01 using 1000 permutations,
## and the adjusted R2 of the global model."
mod <- ordiR2step(RDA0, RDAfull, Pin = 0.01, R2permutations = 1000, R2scope = T, direction = T)
mod$anova

## Removing correlated predictors (iterative Pearson-correlation screening).
## FIX: several identifiers and a filename had been split across lines by
## bad wrapping (CumlCWD, prcpwin, OnsetCWD, "correlation2.csv").
env <- subset(Variables, select = c(tmin, vpdmin, sloprad, prcpfal, mndtmpfal,
                                    tdmean, mxtmpfal, mntmpsu, mxtmpsum,
                                    mndtmpsu, mndtmpwin, AETgdd, CumlAET,
                                    soilawc, prcpwin, CumlCWD, mntmpfal,
                                    MaxCWD, CumlGDD, tmax, tmean, pcseas,
                                    afrad, mntmpspr, OnsetCWD, mndtmpspr,
                                    CumlWs, mntmpwin, DurCWD))
str(env)

correlation <- cor(env, method = "pearson")
write.csv(correlation, "/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/correlation1.csv")

## Second round after dropping highly correlated variables
env2 <- subset(Variables, select = c(sloprad, prcpfal, mndtmpfal, tdmean,
                                     mxtmpfal, mntmpsu, mxtmpsum, soilawc,
                                     prcpwin, CumlCWD, MaxCWD, CumlGDD, tmax,
                                     pcseas, afrad, OnsetCWD, CumlWs))

correlation2 <- cor(env2, method = "pearson")
write.csv(correlation2, "/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/correlation2.csv")

## Final predictor set
env3 <- subset(Variables, select = c(sloprad, tdmean, mxtmpfal, mxtmpsum,
                                     soilawc, prcpwin, tmax, pcseas, afrad,
                                     OnsetCWD, CumlWs))

## BUG FIX: pdf() must be opened BEFORE plotting and closed with dev.off();
## previously pairs.panels() drew to the screen device and the PDF device
## was opened afterwards and never closed, leaving an empty file.
pdf("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/Pairs_pannel3.pdf")
pairs.panels(env3, scale = TRUE, density = TRUE, ellipses = TRUE, methods = "pearson")
dev.off()
#######

####### Selected: sloprad,tdmean,mxtmpfal,mxtmpsum,soilawc,prcpwin,tmax,pcseas,afrad,OnsetCWD,CumlWs

## Full model: climate + population structure (PC1, PC2) + geography.
## FIX: removed a stray duplicated "+" ("PC2+ + Lon") in the formula.
pRDAfull <- rda(gen.imp ~ PC1 + PC2 + Lon + Lat + sloprad + tdmean + mxtmpfal +
                  mxtmpsum + soilawc + prcpwin + tmax + pcseas + afrad +
                  OnsetCWD + CumlWs, Variables)
pRDAfull
#RsquareAdj(pRDAfull)
#anova(pRDAfull)

## Pure climate model: climate predictors conditioned on structure + geography
pRDAclim <- rda(gen.imp ~ sloprad + tdmean + mxtmpfal + mxtmpsum + soilawc +
                  prcpwin + tmax + pcseas + afrad + OnsetCWD + CumlWs +
                  Condition(PC1 + PC2 + Lon + Lat), Variables)
pRDAclim
#RsquareAdj(pRDAclim)
#anova(pRDAclim)

# Model summaries
#RsquareAdj(pRDAclim) # adjusted Rsquared 
#The eigenvalues for the constrained axes reflect the variance explained by each canonical axis:
#summary(pRDAclim, display=NULL) #display = NULL optional  
#summary(eigenvals(pRDAclim, model = "constrained"))
#screeplot(pRDAclim)

#check our RDA model for significance using formal tests. We can assess both the full model and each constrained axis using F-statistics (Legendre et al,
#2010). The null hypothesis is that no linear relationship exists between the SNP data and the environmental predictors.
#anova.cca(pRDAclim, permutations = 100) # full model
#anova.cca(pRDAclim, permutations = 100, by="margin") # per variable 

#signif.axis <- anova.cca(pRDAclim, by="axis", parallel=getOption("mc.cores"))
#signif.axis
#vif.cca(pRDAclim) # variance inflation factor (<10 OK)

# Model: rda(formula = gen.imp ~ AETgdd + AfRad + FallAET + MaxCWD + mxtmpwin + prcpsum + SlopRad + WsAETspr + Condition(PC1 + PC2 + Lon + Lat), data =
#   Variables)
#           Df Variance      F   Pr(>F)   
# AETgdd     1   11.505 15.742 0.009901 **
# AfRad      1   17.800 24.356 0.009901 **
# FallAET    1   14.477 19.808 0.009901 **
# MaxCWD     1   21.014 28.754 0.009901 **
# mxtmpwin   1   15.724 21.516 0.009901 **
# prcpsum    1   16.850 23.056 0.009901 **
# SlopRad    1   13.357 18.276 0.009901 **
# WsAETspr   1   15.852 21.691 0.009901 **


## Pure neutral population structure model: PC1 + PC2 conditioned on
## geography and the selected climate predictors
pRDAstruct <- rda(gen.imp ~ PC1 + PC2 + Condition(Lon + Lat + sloprad+tdmean+mxtmpfal+mxtmpsum+soilawc+prcpwin+tmax+pcseas+afrad+OnsetCWD+CumlWs),  
                  Variables)
pRDAstruct
#RsquareAdj(pRDAstruct)
#anova(pRDAstruct)

## Pure geography model: Lon + Lat conditioned on climate and structure
pRDAgeog <- rda(gen.imp ~ Lon + Lat + Condition(sloprad+tdmean+mxtmpfal+mxtmpsum+soilawc+prcpwin+tmax+pcseas+afrad+OnsetCWD+CumlWs + PC1 + PC2),  Variables)
pRDAgeog
#RsquareAdj(pRDAgeog)
#anova(pRDAgeog)


Table 3.
Partial.RDA.models Inertia R2 P…..F.. Proportion.of.explainable.variance Proportion.of.total.variance
Full model: gen ~ env + structur + geog 30.45 0.09 0.001*** 1.000 0.090
Pure climate: gen ~ env (structur + geog) 14.71 0.04 0.001*** 0.483 0.043
Pure structure: gen ~ structur (env + geog) 6.04 0.02 0.001*** 0.182 0.016
Pure geography: gen ~ geog (env + structur) 3.13 0.01 0.001*** 0.115 0.010
NA NA NA NA
Confounded climate/structure/geography 6.57 NA 0.233 0.021
NA NA NA NA
Total unexplained 298.20 NA NA NA
Total inertia 328.65 NA NA NA

Influence of hybridization

Phylogeny IQtree:
Based on the phylogeny, B. hookeri and B. hybrid are embedded within B. sagittata.


### vcf to nexus (binary) for PhyloNet
python vcf2phylip -i basa_outg.vcf -o LW -b

./iqtree -s basa.phy -nt 10 -m GTR+ASC -bb 1000


Fbranch:
Based on Fbranch results (very conservative parameters) there is gene flow among some B. sagittata populations and B. hookeri and B. hybrid. There is also some ancestral gene flow among B. sagittata and basal clades of B. hookeri and B. hybrid.


./Dsuite Dtrios basa.vcf SETS.txt -t tree_trimmed.tree
./Dsuite Fbranch -Z --Pb-matrix tree_trimmed.tree SETS_tree.txt > fbranch_collapsed_p.txt

head -n 79 fbranch_collapsed_p.txt > f_plotting.txt

#read in z-scores (f-branch statistics table produced by Dsuite Fbranch)
zscores <- read.csv("f_plotting.txt", sep = "\t")
# conservative filter: zero out every value below 0.05 before plotting
zscores[zscores < 0.05] <- 0
write.table(zscores, "f_5_percent_plotting_trimmed.txt", sep='\t',row.names=F, quote=F)

./dtools.py f_5_percent_plotting_trimmed.txt tree_trimmed.tree --outgroup LW_ME_2 --ladderize


PhyloNet:

Two reticulations the most optimal.


### Trimming the vcf:
vcftools --vcf basa_final.vcf --remove remove.txt (a single individual per population, except for the outgroups)
Running PhyloNet from  0 to 10 networks:
# nexus file, each sample with a single letter different ID
# First 0 network, then 1 network, 2 etc.. based on the former network (see example file), 10 iterations
  
java -jar PhyloNet.jar phylonet_todos0.nex > results.txt


Genetic distances with outgroups:
B. hookeri seems to be more differentiated from B. sagitatta and the B. hybrid based on pairwise genetic distances.


#### G.distances
##python vcf2phylip.py -i basa_final.vcf -fasta

library(reshape)
library(dplyr)
library(gdata)
library(ape)
library(ggplot2)
library(ggrepel)
library(ggExtra)

# Pairwise raw genetic distances (%) among all individuals, plotted per taxon.
dna_aligned <- read.FASTA("basa_final.min4.fasta", type = "DNA")

# Raw p-distances as percentages, with pairwise deletion of missing sites
dna_distances <- as.matrix(dist.dna(dna_aligned, model = "raw", pairwise.deletion = T) * 100)
write.csv(dna_distances, "pairwise_distances.csv")

# Blank out self-comparisons, then reshape the matrix to long format
diag(dna_distances) <- NA
data <- melt(dna_distances)

# Scatter of distances per taxon; dashed reference line at 3%
p <- ggplot(data = data, aes(x = X2, y = value, label = X1)) +
  geom_point() +
  scale_y_continuous(breaks = seq(0, 5, 1)) +
  xlab('Taxa') +
  ylab("Genetic distance % (raw)") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.65, vjust = 0.1),
        panel.background = element_rect(fill = NA),
        panel.grid.major = element_line(colour = "lightgray"),
        panel.grid.major.y = element_line(size = NA),
        panel.ontop = F) +
  geom_hline(yintercept = 3, linetype = "dashed", color = "gray")

# Attach a marginal histogram of distances along the y axis and save
p1 <- ggMarginal(p, margins = "y", type = "histogram")
ggsave(plot = p1, "genetic_distances.pdf", width = 30, height = 15)